library(data.table)
data.table 1.15.2 using 80 threads (see ?getDTthreads). Latest news: r-datatable.com
library(readxl)
library(ggplot2)
library(gprofiler2)
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-8
library(ComplexHeatmap)
Loading required package: grid
========================================
ComplexHeatmap version 2.14.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
If you use it in published research, please cite either one:
- Gu, Z. Complex Heatmap Visualization. iMeta 2022.
- Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
genomic data. Bioinformatics 2016.
The new InteractiveComplexHeatmap package can directly export static
complex heatmaps into an interactive Shiny app with zero effort. Have a try!
This message can be suppressed by:
suppressPackageStartupMessages(library(ComplexHeatmap))
========================================
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: ‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
library(caret)
Loading required package: lattice
library(mixOmics)
Loading required package: MASS
Loaded mixOmics 6.22.0
Thank you for using mixOmics!
Tutorials: http://mixomics.org
Bookdown vignette: https://mixomicsteam.github.io/Bookdown
Questions, issues: Follow the prompts at http://mixomics.org/contact-us
Cite us: citation('mixOmics')
Attaching package: ‘mixOmics’
The following objects are masked from ‘package:caret’:
nearZeroVar, plsda, splsda
library(infotheo)
library(circlize)
========================================
circlize version 0.4.16
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: https://jokergoo.github.io/circlize_book/book/
If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
in R. Bioinformatics 2014.
This message can be suppressed by:
suppressPackageStartupMessages(library(circlize))
========================================
library(grDevices)
set.seed(101)
# read meta data
meta <- as.data.table(readxl::read_excel("data/transcritome_patients_translated.xlsx"))
meta_key <- "CEL_FILE"
setkeyv(meta, meta_key) # set key
stopifnot(!any( duplicated(meta[,..meta_key]) )) # check for duplicate rows
stopifnot(!any( duplicated(colnames(meta)) )) # check for duplicate columns
# read Feuil1
feuil1 <- as.data.table(readxl::read_excel("data/transcriptome_expression_matrix.xlsx",sheet="Feuil1"))
New names:
# read expression data
expr <- as.data.table(readxl::read_excel("data/transcriptome_expression_matrix.xlsx"))
New names:
expr_key <- "gene"
colnames(expr)[1] <- expr_key
setkeyv(expr, expr_key) # set key
stopifnot(!any( duplicated(expr[,..expr_key]) )) # check for duplicate rows
stopifnot(!any( duplicated(colnames(expr)) )) # check for duplicate columns
expr <- expr[, c(key(expr), meta$CEL_FILE), with=F]
stopifnot(all(colnames(expr)[-1]==meta[,CEL_FILE]))
expr <- as.matrix(expr, rownames="gene")
meta_M0 <- meta[LOCATION=="M0"]
meta_MI <- meta[LOCATION=="MI"]
meta_M6 <- meta[LOCATION=="M6"]
meta_M0M6 <- meta[LOCATION=="M6"|LOCATION=="M0"]
meta_M0MI <- meta[LOCATION=="M0"|LOCATION=="MI"]
meta_M0MIC <- meta[LOCATION=="M0"|LOCATION=="MI"|LOCATION=="Ctrl"]
# read mouse signature
signature_dt <- fread("data/signature.csv")
signature_mouse <- signature_dt$gene
setkeyv(signature_dt, "gene")
# read all mouse genes measured with nanostring
measured_genes_mouse <- fread("data/All samples_NormalizedData.csv")[[1]]
# check that all signature genes are part of the measured genes
stopifnot(all(signature_mouse %in% measured_genes_mouse))
# map to human orthologs
ortholog_mapping <- as.data.table(gprofiler2::gorth(measured_genes_mouse, source_organism = "mmusculus", target_organism = "hsapiens", filter_na = F))
stopifnot(all(measured_genes_mouse %in% ortholog_mapping$input))
# mark signature genes and hits in the human expr data
ortholog_mapping$in_expr <- ortholog_mapping$ortholog_name %in% rownames(expr)
ortholog_mapping$in_signature <- ortholog_mapping$input %in% signature_mouse
ortholog_mapping[in_signature==T,mouse:=signature_dt[input,pattern]]
# count number of hits in the human expr data per mouse gene
n_in_expr <- ortholog_mapping[,.("n_hits" = sum(in_expr)),by=input]
# print number of hits in expr$gene per measured mouse gene
table(n_in_expr$n_hits)
0 1 2 3 6 7
48 708 8 2 1 1
# select only mouse genes with orthologs uniquely mapped to the human expression data
unique_ortholog_mapping <- ortholog_mapping[input %in% n_in_expr[n_hits==1,input] & in_expr]
# unmapped signature genes
print(paste0(nrow(unique_ortholog_mapping)," out of ",length(measured_genes_mouse), " measured mouse genes were mapped to the human expression genes"))
[1] "708 out of 768 measured mouse genes were mapped to the human expression genes"
ortholog_mapping[!input %in% unique_ortholog_mapping$input & in_signature]
plot_data <- data.table(sample=colnames(expr))
plot_data[,colSum:=colSums(expr)]
plot_data[,RECURRENCE:=meta$RECURRENCE]
plot_data[,LOCATION:=meta$LOCATION]
plot_data[,GENDER:=meta$GENDER]
ggplot(plot_data,aes(x=colSum,fill=RECURRENCE)) +
geom_density(alpha=0.5)
ggplot(plot_data,aes(x=colSum,fill=LOCATION)) +
geom_density(alpha=0.5)
ggplot(plot_data,aes(x=colSum,fill=GENDER)) +
geom_density(alpha=0.5)
high_var_genes <- sort(apply(expr[,meta$CEL_FILE], 1, sd), decreasing = T)[1:10000]
high_var_expr <- t(expr[names(high_var_genes),meta$CEL_FILE])
high_var_pca <- pca(high_var_expr, ncomp = 3, scale = T)
plotIndiv(high_var_pca, group = meta$LOCATION, ind.names = FALSE,
legend = TRUE, title="PCA - high variance genes", ellipse = T)
signature_pca <- pca(t(expr[unique_ortholog_mapping[in_expr==T & in_signature, ortholog_name], meta$CEL_FILE]), ncomp = 3, scale = TRUE)
plotIndiv(signature_pca, group = meta$LOCATION, ind.names = FALSE,
legend = TRUE, title="PCA - signature genes", ellipse = T)
table(meta$RECURRENCE, meta$LOCATION)
Ctrl M0 M6 MI
Ctrl 25 0 0 0
NR 0 57 36 43
R 0 139 85 104
table(meta$LOCATION)
Ctrl M0 M6 MI
25 196 121 147
Ctrl (25) -> Ctrl (25) M0 (196) -> M0I (200) MI (147) -> M0M (149) M6 (121) -> M6 (122) Why do the numbers from publication and meta sheet not match?
What is CENTRE? What are stenose, fistule, inflammatoire, Stoma? What is Postoperative anti-TNF?
Why do RutgeertRec and RECURRENCE not match 1 to 1?
table(meta$RutgeertRec, meta$RECURRENCE)
Ctrl NR R
Ctrl 25 0 0
Rec 0 0 326
Rem 0 136 2
custom_heatmap <- function(expr, meta, genes, column_names = F, scale = T, ...){
if(is.null(meta)){
samples <- colnames(expr)
column_ha = NULL
}else{
meta <- as.data.frame(meta, row.names = meta[[1]])[,-1,drop=F]
samples <- rownames(meta)
column_ha = HeatmapAnnotation(df=meta)
}
genes <- as.data.frame(genes, row.names = genes[[1]])[,-1,drop=F]
gene_names <- rownames(genes)
row_ha = rowAnnotation(
df=genes,
annotation_name_side="top",
#annotation_label=("\u0394/\u0394IEC"),
col = list(mouse= c("up" = "#E8E700", "down" = "#0092F4"))
)
if(scale){
expr <- t(apply(expr[gene_names,samples],1,scale))
colnames(expr) <- samples
}else{
expr <- expr[gene_names,samples]
}
Heatmap(expr,
show_column_names = column_names,
top_annotation = column_ha,
name = "Expression",
row_names_gp = gpar(fontsize = 10),
left_annotation = row_ha,
column_title_side = "top",
...
)
}
custom_heatmap <- function(expr, meta, genes, column_names = F, scale = T, ...){
if(is.null(meta)){
samples <- colnames(expr)
column_ha = NULL
}else{
meta <- as.data.frame(meta, row.names = meta[[1]])[,-1,drop=F]
samples <- rownames(meta)
column_ha = HeatmapAnnotation(df=meta)
}
genes <- as.data.frame(genes, row.names = genes[[1]])[,-1,drop=F]
gene_names <- rownames(genes)
row_ha = rowAnnotation(
df=genes,
annotation_name_side="top",
#annotation_label=("\u0394/\u0394IEC"),
col = list(mouse= c("up" = "#E8E700", "down" = "#0092F4"))
)
if(scale){
expr <- t(apply(expr[gene_names,samples],1,scale))
colnames(expr) <- samples
}else{
expr <- expr[gene_names,samples]
}
Heatmap(expr,
show_column_names = column_names,
top_annotation = column_ha,
name = "Expression",
row_names_gp = gpar(fontsize = 10),
left_annotation = row_ha,
column_title_side = "top",
...
)
}
custom_heatmap(
expr,
meta[LOCATION=="M0",.(CEL_FILE,RECURRENCE,LOCATION,inflammatoire,Smoker,Granuloma,Rutgeert2)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = T)
custom_heatmap(
expr,
meta[LOCATION=="M6",.(CEL_FILE,RECURRENCE,LOCATION,Reecal_Rut=as.numeric(Reeval_Rut))],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = T)
custom_heatmap(
expr,
meta[LOCATION=="M6",.(CEL_FILE,RECURRENCE,LOCATION)][order(RECURRENCE)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = F)
custom_heatmap(
expr,
meta[LOCATION=="M0"|LOCATION=="MI",.(CEL_FILE,LOCATION)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = T)
custom_heatmap(
expr,
meta[LOCATION=="M0"|LOCATION=="MI",.(CEL_FILE,LOCATION)][order(LOCATION)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)],
cluster_columns = F)
custom_heatmap(
expr,
meta[LOCATION=="M0"|LOCATION=="MI",.(CEL_FILE,LOCATION)][order(LOCATION)],
unique_ortholog_mapping[in_expr==T&in_signature==T,.(ortholog_name,mouse)][order(mouse)],
cluster_columns = F, cluster_rows = F)
NA
NA
# MI, M0, Ctrl
expr_scaled <- apply(expr[unique_ortholog_mapping[in_expr==T&in_signature==T,ortholog_name],meta_M0MIC$CEL_FILE] ,1,scale)
rownames(expr_scaled) <- meta_M0MIC$CEL_FILE
summarized_expr <- sapply(unique(meta_M0MIC$LOCATION), function(location){
colMeans(expr_scaled[meta_M0MIC[LOCATION==location,CEL_FILE],])
})
# mutual information
concordance <- data.frame(
mouse = unique_ortholog_mapping[in_signature==T,mouse],
human = ifelse(sign(summarized_expr[unique_ortholog_mapping[in_signature==T,ortholog_name],"M0"])>0, "up", "down")
)
concordance$concordance <- concordance$mouse == concordance$human
mi <- mutinformation(concordance$mouse,concordance$human)
random_mi <- sapply(1:1000, function(i){mutinformation(sample(concordance$mouse),concordance$human)})
mi_pval <- (sum(random_mi>mi) + 1)/(length(random_mi)+1)
paste("concordant genes:",sum(concordance$concordance),"mutual information:",mi,"p-value",mi_pval)
[1] "concordant genes: 23 mutual information: 0.0923613499679372 p-value 0.00699300699300699"
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)] ,column_names=T, scale = F)
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)][order(mouse)],cluster_rows = F, column_names=T, scale = F)
# MI, M0
expr_scaled <- apply(expr[unique_ortholog_mapping[in_expr==T&in_signature==T,ortholog_name],meta_M0MI$CEL_FILE] ,1,scale)
rownames(expr_scaled) <- meta_M0MI$CEL_FILE
summarized_expr <- sapply(unique(meta_M0MI$LOCATION), function(location){
colMeans(expr_scaled[meta_M0MI[LOCATION==location,CEL_FILE],])
})
# mutual information
concordance <- data.frame(
mouse = unique_ortholog_mapping[in_signature==T,mouse],
human = ifelse(sign(summarized_expr[unique_ortholog_mapping[in_signature==T,ortholog_name],"M0"])>0, "up", "down")
)
concordance$concordance <- concordance$mouse == concordance$human
mi <- mutinformation(concordance$mouse,concordance$human)
random_mi <- sapply(1:1000, function(i){mutinformation(sample(concordance$mouse),concordance$human)})
mi_pval <- (sum(random_mi>mi) + 1)/(length(random_mi)+1)
paste("concordant genes:",sum(concordance$concordance),"mutual information:",mi,"p-value",mi_pval)
[1] "concordant genes: 24 mutual information: 0.129536460802155 p-value 0.00599400599400599"
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)] ,column_names=T, scale = F)
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)][order(mouse)],cluster_rows = F, column_names=T, scale = F)
# scale expression data
expr_scaled <- apply(expr[unique_ortholog_mapping[in_signature==T,ortholog_name],meta_M0MI$CEL_FILE] ,1,scale)
rownames(expr_scaled) <- meta_M0MI$CEL_FILE
# mean per location
summarized_expr <- sapply(unique(c("MI","M0")), function(location){
colMeans(expr_scaled[meta_M0MI[LOCATION==location,CEL_FILE],])
})
# concordance data
concordance <- data.frame(
mouse = unique_ortholog_mapping[in_signature==T,mouse],
human = ifelse(sign(summarized_expr[unique_ortholog_mapping[in_signature==T,ortholog_name],"M0"])>0, "up", "down")
)
concordance$concordance <- concordance$mouse == concordance$human
mi <- mutinformation(concordance$mouse,concordance$human)
set.seed(0)
random_mi <- sapply(1:10000, function(i){mutinformation(sample(concordance$mouse),concordance$human)})
mi_pval <- (sum(random_mi>mi) + 1)/(length(random_mi)+1)
paste("concordant genes:",sum(concordance$concordance),"mutual information:",mi,"p-value",mi_pval)
[1] "concordant genes: 24 mutual information: 0.129536460802155 p-value 0.004999500049995"
concordance$Mice <- ifelse(concordance$mouse=="up", "up in \u0394/\u0394IEC", "down in \u0394/\u0394IEC")
grDevices::cairo_pdf("heatmap.pdf", width = 4, height = 7,)
# row annotation
row_ha = rowAnnotation(
Mice=concordance$Mice,
annotation_name_side="top",
annotation_name_rot=0,
col = list(Mice= c("up in \u0394/\u0394IEC" = "#E8E700", "down in \u0394/\u0394IEC" = "#0092F4"))
)
col_fun = colorRamp2(c(-0.5, 0, 0.5), c("blue", "white", "red"))
concordance$col <- "grey"
concordance[concordance$concordance & concordance$mouse=="up",]$col <- "red"
concordance[concordance$concordance & concordance$mouse=="down",]$col <- "blue"
Heatmap(summarized_expr,
name = "Expression",
row_names_gp = gpar(fontsize = 9, col = concordance$col),
left_annotation = row_ha,
column_title_side = "top",
cluster_columns = F,
column_labels = c("M0M","M0I"),
column_names_side = "top",
column_names_rot = 0,
column_names_centered = T,
col = col_fun,
width = unit(3, "cm"),
)
dev.off()
null device
1
custom_heatmap(summarized_expr, NULL, unique_ortholog_mapping[in_signature==T,.(ortholog_name,mouse,concordance=concordance$concordance)] ,column_names=T, scale = F)
high_var_genes <- sort(apply(expr[,meta_M0$CEL_FILE], 1, sd), decreasing = T)[1:5000]
pls_da_expr <- t(expr[names(high_var_genes),meta_M0$CEL_FILE])
#pls_da_expr <- 2 ^ pls_da_expr
pca.expr <- pca(pls_da_expr, ncomp = 3, scale = TRUE)
plotIndiv(pca.expr, group = meta_M0$RECURRENCE, ind.names = FALSE,
legend = TRUE,
title = 'PCA comp 1 - 2')
plsda.expr <- plsda(pls_da_expr, meta_M0$RECURRENCE, ncomp = 10)
perf.plsda.expr <- perf(plsda.expr, validation = 'Mfold', folds = 3,
progressBar = FALSE,
nrepeat = 10)
plot(perf.plsda.expr, sd = TRUE, legend.position = 'horizontal')
logistic_glmnet_loc <- function(meta, expr, signature){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$LOCATION
fit <- glmnet(x, y, family = "binomial")
plot(fit, label = T)
cvfit <- cv.glmnet(x, y, family = "binomial")
plot(cvfit)
print(cvfit)
print(coef(cvfit, s = "lambda.1se"))
}
logistic_glm_loc <- function(meta, expr, signature, roc=F, summary=F, performance=F){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$LOCATION
data <- as.data.frame(x)
data$LOCATION <- 0
data$LOCATION[y=="M0"] <- 1
glm_model <- glm(LOCATION ~.,family = "binomial", data)
# prediction
model_prob = predict(glm_model, type = "response")
model_pred = ifelse(model_prob > 0.5, "M0", "MI")
train_tab = table(predicted = model_pred, actual = y)
train_con_mat = confusionMatrix(train_tab)
if(summary){
print(summary(glm_model))
}
if(roc){
roc(y ~ model_prob, plot = TRUE, print.auc = TRUE)
}
if(performance){
print(train_con_mat)
}
train_con_mat$overall["Accuracy"]
}
signature_human <- unique_ortholog_mapping[in_signature==T,ortholog_name]
logistic_glmnet_loc(meta_M0MI, expr, signature_human)
Call: cv.glmnet(x = x, y = y, family = "binomial")
Measure: Binomial Deviance
Lambda Index Measure SE Nonzero
min 0.00653 39 1.085 0.03910 22
1se 0.05055 17 1.121 0.03892 7
33 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 2.55205119
ALDOB .
APOA1 0.14498197
APOB .
APOC3 .
ARG1 .
ASNS .
CACNA1A .
CD274 .
CPS1 .
CREB3L3 0.02933740
CXCL9 .
DMGDH .
DUOX2 .
FAHD1 .
ICOS .
IDO1 .
INMT -0.10196269
KYAT3 .
NOS2 .
NOX1 .
OTC 0.07169543
PCK1 .
PDK4 .
PHGDH .
PSAT1 .
PTK6 .
RIMKLA .
SLC7A11 -0.21170216
SPIB .
STAT1 -0.27217305
TLR4 -0.05028450
TNF .
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm_loc(meta_M0MI, expr, signature_human, T, T, T)
Call:
glm(formula = LOCATION ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6509 -0.6866 0.1288 0.7103 2.5686
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 21.87211 14.56681 1.502 0.13323
ALDOB -2.40915 0.78284 -3.077 0.00209 **
APOA1 -0.39068 0.27320 -1.430 0.15271
APOB 0.20633 0.35331 0.584 0.55921
APOC3 0.36578 0.26479 1.381 0.16715
ARG1 -0.66917 0.86300 -0.775 0.43811
ASNS -0.04427 0.47465 -0.093 0.92568
CACNA1A 0.94422 0.51347 1.839 0.06593 .
CD274 -0.26257 0.32521 -0.807 0.41945
CPS1 1.70922 0.42632 4.009 6.09e-05 ***
CREB3L3 -0.21153 0.27957 -0.757 0.44927
CXCL9 -0.28598 0.22484 -1.272 0.20339
DMGDH -0.49668 0.90277 -0.550 0.58220
DUOX2 -0.07147 0.12958 -0.552 0.58128
FAHD1 -0.03254 0.53800 -0.060 0.95177
ICOS -0.15047 0.40333 -0.373 0.70910
IDO1 0.29994 0.31233 0.960 0.33688
INMT 0.25605 0.33474 0.765 0.44432
KYAT3 -0.60916 0.73320 -0.831 0.40607
NOS2 0.20450 0.28283 0.723 0.46965
NOX1 -0.88635 0.44853 -1.976 0.04814 *
OTC -1.03341 0.42154 -2.452 0.01423 *
PCK1 0.33806 0.18987 1.780 0.07500 .
PDK4 -0.12923 0.18824 -0.687 0.49238
PHGDH -0.59088 0.62564 -0.944 0.34495
PSAT1 -0.43509 0.36122 -1.205 0.22839
PTK6 0.41397 0.49678 0.833 0.40467
RIMKLA -0.18986 0.58764 -0.323 0.74663
SLC7A11 0.60062 0.27040 2.221 0.02634 *
SPIB -0.18824 0.26705 -0.705 0.48086
STAT1 0.68441 0.50918 1.344 0.17890
TLR4 0.68613 0.36408 1.885 0.05949 .
TNF -0.24646 0.42423 -0.581 0.56128
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 468.47 on 342 degrees of freedom
Residual deviance: 297.93 on 310 degrees of freedom
AIC: 363.93
Number of Fisher Scoring iterations: 7
Setting levels: control = M0, case = MI
Setting direction: controls > cases
Confusion Matrix and Statistics
actual
predicted M0 MI
M0 164 35
MI 32 112
Accuracy : 0.8047
95% CI : (0.7587, 0.8453)
No Information Rate : 0.5714
P-Value [Acc > NIR] : <2e-16
Kappa : 0.6002
Mcnemar's Test P-Value : 0.807
Sensitivity : 0.8367
Specificity : 0.7619
Pos Pred Value : 0.8241
Neg Pred Value : 0.7778
Prevalence : 0.5714
Detection Rate : 0.4781
Detection Prevalence : 0.5802
Balanced Accuracy : 0.7993
'Positive' Class : M0
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm_loc(meta_M0MI, expr, random_signature)
})
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")
(sum(random_accuracy>accuracy)+1)/(length(random_signatures)+1)
[1] 0.2217782
logistic_glmnet <- function(meta, expr, signature){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$RECURRENCE
fit <- glmnet(x, y, family = "binomial")
plot(fit, label = T)
cvfit <- cv.glmnet(x, y, family = "binomial", type.measure = "class")
plot(cvfit)
print(cvfit)
}
logistic_glm <- function(meta, expr, signature, roc=F, summary=F, performance=F){
x <- t(expr[signature, meta$CEL_FILE])
y <- meta$RECURRENCE
data <- as.data.frame(x)
data$RECURRENCE <- 0
data$RECURRENCE[y=="R"] <- 1
glm_model <- glm(RECURRENCE ~.,family = "binomial", data)
# prediction
model_prob = predict(glm_model, type = "response")
model_pred = ifelse(model_prob > 0.5, "R", "NR")
train_tab = table(predicted = model_pred, actual = y)
train_con_mat = confusionMatrix(train_tab)
if(summary){
print(summary(glm_model))
}
if(roc){
roc(y ~ model_prob, plot = TRUE, print.auc = TRUE)
}
if(performance){
print(train_con_mat)
}
train_con_mat$overall["Accuracy"]
}
signature_human <- unique_ortholog_mapping[in_signature==T,ortholog_name]
Feature selection with logistic glmnet
logistic_glmnet(meta_M0, expr, signature_human)
Call: cv.glmnet(x = x, y = y, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.06086 1 0.2908 0.0275 0
1se 0.06086 1 0.2908 0.0275 0
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm(meta_M0, expr, signature_human, T, T, T)
Call:
glm(formula = RECURRENCE ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6967 -0.8290 0.5509 0.8016 1.8942
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -15.06772 15.79400 -0.954 0.3401
ALDOB -0.92023 0.41840 -2.199 0.0278 *
APOA1 0.10998 0.27791 0.396 0.6923
APOB 0.52203 0.30590 1.707 0.0879 .
APOC3 -0.27377 0.29954 -0.914 0.3607
ARG1 0.14516 1.07260 0.135 0.8923
ASNS -0.42582 0.53167 -0.801 0.4232
CACNA1A 0.16560 0.62152 0.266 0.7899
CD274 0.07812 0.37852 0.206 0.8365
CPS1 0.21568 0.37820 0.570 0.5685
CREB3L3 0.05876 0.34063 0.173 0.8630
CXCL9 0.25207 0.28752 0.877 0.3806
DMGDH -1.29012 0.89202 -1.446 0.1481
DUOX2 -0.05173 0.18021 -0.287 0.7741
FAHD1 0.41210 0.68509 0.602 0.5475
ICOS -0.50734 0.48449 -1.047 0.2950
IDO1 -0.24146 0.40907 -0.590 0.5550
INMT 0.74965 0.38851 1.930 0.0537 .
KYAT3 1.29103 0.80823 1.597 0.1102
NOS2 -0.06091 0.35451 -0.172 0.8636
NOX1 0.52374 0.55059 0.951 0.3415
OTC -0.03520 0.42550 -0.083 0.9341
PCK1 0.16418 0.17516 0.937 0.3486
PDK4 -0.13744 0.24200 -0.568 0.5701
PHGDH -0.16188 0.64987 -0.249 0.8033
PSAT1 -0.19922 0.44249 -0.450 0.6526
PTK6 -0.55283 0.65167 -0.848 0.3963
RIMKLA 0.94240 0.71040 1.327 0.1847
SLC7A11 0.42755 0.28598 1.495 0.1349
SPIB 0.20429 0.31458 0.649 0.5161
STAT1 0.80068 0.72341 1.107 0.2684
TLR4 -0.34050 0.39892 -0.854 0.3934
TNF -0.34636 0.39681 -0.873 0.3827
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 236.33 on 195 degrees of freedom
Residual deviance: 200.82 on 163 degrees of freedom
AIC: 266.82
Number of Fisher Scoring iterations: 5
Setting levels: control = NR, case = R
Setting direction: controls < cases
Confusion Matrix and Statistics
actual
predicted NR R
NR 19 9
R 38 130
Accuracy : 0.7602
95% CI : (0.6942, 0.8182)
No Information Rate : 0.7092
P-Value [Acc > NIR] : 0.06559
Kappa : 0.316
Mcnemar's Test P-Value : 4.423e-05
Sensitivity : 0.33333
Specificity : 0.93525
Pos Pred Value : 0.67857
Neg Pred Value : 0.77381
Prevalence : 0.29082
Detection Rate : 0.09694
Detection Prevalence : 0.14286
Balanced Accuracy : 0.63429
'Positive' Class : NR
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm(meta_M0, expr, random_signature)
})
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")
Feature selection with logistic glmnet
logistic_glmnet(meta_M6, expr, signature_human)
Call: cv.glmnet(x = x, y = y, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.07089 8 0.2893 0.03628 5
1se 0.13596 1 0.2975 0.03363 0
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm(meta_M6, expr, signature_human, T, T, T)
Call:
glm(formula = RECURRENCE ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3392 -0.5410 0.2409 0.6472 1.6068
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -11.93445 27.08750 -0.441 0.65951
ALDOB 0.95027 1.21594 0.782 0.43450
APOA1 -1.62620 1.12759 -1.442 0.14925
APOB 0.95201 1.37919 0.690 0.49003
APOC3 0.27513 0.96583 0.285 0.77575
ARG1 -0.62188 2.15069 -0.289 0.77246
ASNS -1.23129 0.95416 -1.290 0.19689
CACNA1A -2.16046 1.20953 -1.786 0.07407 .
CD274 0.99056 0.86740 1.142 0.25346
CPS1 -0.13152 1.07094 -0.123 0.90226
CREB3L3 0.47578 0.61427 0.775 0.43861
CXCL9 0.18221 0.49859 0.365 0.71477
DMGDH -1.96184 2.60050 -0.754 0.45060
DUOX2 0.28120 0.28756 0.978 0.32814
FAHD1 3.23421 1.45507 2.223 0.02623 *
ICOS 0.47828 0.98646 0.485 0.62779
IDO1 -0.03772 0.79989 -0.047 0.96239
INMT -0.56813 1.15972 -0.490 0.62421
KYAT3 0.55285 1.49817 0.369 0.71212
NOS2 -1.08783 0.58724 -1.852 0.06396 .
NOX1 0.50637 0.96109 0.527 0.59828
OTC -1.05608 1.05991 -0.996 0.31906
PCK1 0.42407 0.54764 0.774 0.43872
PDK4 -1.22094 0.63454 -1.924 0.05434 .
PHGDH 5.37410 1.49335 3.599 0.00032 ***
PSAT1 -0.10896 0.70550 -0.154 0.87726
PTK6 -2.29695 1.14187 -2.012 0.04426 *
RIMKLA -0.63140 1.50560 -0.419 0.67495
SLC7A11 -0.21761 0.75281 -0.289 0.77253
SPIB -0.68279 0.81850 -0.834 0.40417
STAT1 -0.35248 0.99028 -0.356 0.72189
TLR4 -0.15202 0.82122 -0.185 0.85314
TNF 2.17205 1.21091 1.794 0.07286 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 147.317 on 120 degrees of freedom
Residual deviance: 94.893 on 88 degrees of freedom
AIC: 160.89
Number of Fisher Scoring iterations: 6
Setting levels: control = NR, case = R
Setting direction: controls < cases
Confusion Matrix and Statistics
actual
predicted NR R
NR 23 10
R 13 75
Accuracy : 0.8099
95% CI : (0.7286, 0.8755)
No Information Rate : 0.7025
P-Value [Acc > NIR] : 0.005027
Kappa : 0.5341
Mcnemar's Test P-Value : 0.676657
Sensitivity : 0.6389
Specificity : 0.8824
Pos Pred Value : 0.6970
Neg Pred Value : 0.8523
Prevalence : 0.2975
Detection Rate : 0.1901
Detection Prevalence : 0.2727
Balanced Accuracy : 0.7606
'Positive' Class : NR
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm(meta_M6, expr, random_signature)
})
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurredWarning: glm.fit: fitted probabilities numerically 0 or 1 occurred
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")
Feature selection with logistic glmnet
logistic_glmnet(meta_MI, expr, signature_human)
Call: cv.glmnet(x = x, y = y, type.measure = "class", family = "binomial")
Measure: Misclassification Error
Lambda Index Measure SE Nonzero
min 0.05889 1 0.2925 0.03973 0
1se 0.05889 1 0.2925 0.03973 0
Logistic regression with full signature
# Analysis with glm
accuracy <- logistic_glm(meta_MI, expr, signature_human, T, T, T)
Call:
glm(formula = RECURRENCE ~ ., family = "binomial", data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3934 -0.7820 0.4690 0.7313 2.2220
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.17530 24.79441 -0.168 0.8663
ALDOB 2.26021 1.58793 1.423 0.1546
APOA1 1.00416 0.53227 1.887 0.0592 .
APOB -2.01022 1.00052 -2.009 0.0445 *
APOC3 -0.30466 0.52124 -0.584 0.5589
ARG1 -1.85550 1.28265 -1.447 0.1480
ASNS 0.22660 0.75871 0.299 0.7652
CACNA1A 0.01282 0.77252 0.017 0.9868
CD274 0.56382 0.57507 0.980 0.3269
CPS1 0.53332 0.65796 0.811 0.4176
CREB3L3 -0.54022 0.47594 -1.135 0.2564
CXCL9 -0.30102 0.37615 -0.800 0.4236
DMGDH 0.41788 1.88106 0.222 0.8242
DUOX2 0.10864 0.19018 0.571 0.5678
FAHD1 0.36869 0.80406 0.459 0.6466
ICOS -1.39655 0.63234 -2.209 0.0272 *
IDO1 0.78642 0.52369 1.502 0.1332
INMT 0.43336 0.54341 0.797 0.4252
KYAT3 0.99032 1.16555 0.850 0.3955
NOS2 0.52479 0.46142 1.137 0.2554
NOX1 0.27130 0.85806 0.316 0.7519
OTC -0.83141 0.74064 -1.123 0.2616
PCK1 -0.21972 0.48399 -0.454 0.6498
PDK4 -0.11671 0.32671 -0.357 0.7209
PHGDH -0.37393 1.09492 -0.342 0.7327
PSAT1 -0.36391 0.58402 -0.623 0.5332
PTK6 0.16596 0.76056 0.218 0.8273
RIMKLA -0.51355 1.12269 -0.457 0.6474
SLC7A11 -0.46536 0.44119 -1.055 0.2915
SPIB -0.37485 0.48838 -0.768 0.4428
STAT1 -0.87887 0.81733 -1.075 0.2822
TLR4 0.45989 0.64286 0.715 0.4744
TNF 0.72373 0.98126 0.738 0.4608
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 177.69 on 146 degrees of freedom
Residual deviance: 142.78 on 114 degrees of freedom
AIC: 208.78
Number of Fisher Scoring iterations: 5
Setting levels: control = NR, case = R
Setting direction: controls < cases
Confusion Matrix and Statistics
actual
predicted NR R
NR 17 9
R 26 95
Accuracy : 0.7619
95% CI : (0.6847, 0.8282)
No Information Rate : 0.7075
P-Value [Acc > NIR] : 0.085037
Kappa : 0.3493
Mcnemar's Test P-Value : 0.006841
Sensitivity : 0.3953
Specificity : 0.9135
Pos Pred Value : 0.6538
Neg Pred Value : 0.7851
Prevalence : 0.2925
Detection Rate : 0.1156
Detection Prevalence : 0.1769
Balanced Accuracy : 0.6544
'Positive' Class : NR
Logistic regression with random signatures
random_signatures <- lapply(1:1000,function(i){
sample(unique_ortholog_mapping$ortholog_name,length(signature_human))
})
random_accuracy <- sapply(random_signatures, function(random_signature){
logistic_glm(meta_MI, expr, random_signature)
})
ggplot(data.frame(random_accuracy=random_accuracy), aes(x=random_accuracy)) +
geom_histogram(bins=30) +
geom_vline(xintercept=accuracy, color="red")